# display()/HTML moved out of IPython.core.display; the public import path is
# IPython.display (the old one emits a DeprecationWarning and was later removed).
from IPython.display import display, HTML
# Widen the notebook cells to 95% of the browser window.
display(HTML("<style>.container { width:95% !important; }</style>"))
ManyGiftsUK asked us to
explore the data and build models to answer the following problems:
- Recommender system: the website homepage offers a wide range of products the user might be interested in
- Cold start: offer relevant products to new customers
Implement adequate evaluation strategies and select an appropriate quality measure.
| Phase | Time | Resources | Risks |
|---|---|---|---|
| Business Understanding | 2 days | All analysts | Economic and market changes |
| Data Understanding | 2 days | All analysts | Data problems, technological problems |
| Data Preparation | 2 days | Data scientists, DB engineers | Data problems, technological problems |
| Modeling | 4 days | Data scientists | Technological problems, inability to build adequate model |
| Evaluation | 2 days | All analysts | Economic change, inability to implement results |
| Deployment | 2 days | Data scientists, DB engineers, implementation team | Economic change, inability to implement results |
| Name | Meaning |
|---|---|
| InvoiceNo | Invoice number. Nominal, a 6-digit integral number uniquely assigned to each transaction. If this code starts with letter 'c', it indicates a cancellation |
| StockCode | Product (item) code. Nominal, a 5-digit integral number uniquely assigned to each distinct product |
| Description | Product (item) name. Nominal |
| Quantity | The quantities of each product (item) per transaction. Numeric |
| InvoiceDate | Invoice Date and time. Numeric, the day and time when each transaction was generated |
| UnitPrice | Unit price. Numeric, Product price per unit in pounds |
| CustomerID | Customer number. Nominal, a 5-digit integral number uniquely assigned to each customer |
| Country | Country name. Nominal, the name of the country where each customer resides |
# conda install implicit -c conda-forge -n root
# pip install implicit
import pandas as pd
import numpy as np
import implicit
from scipy import sparse
from scipy.sparse import coo_matrix
from implicit.als import AlternatingLeastSquares
from implicit.evaluation import ranking_metrics_at_k
from sklearn.decomposition import TruncatedSVD
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics.pairwise import cosine_similarity
from implicit.evaluation import ranking_metrics_at_k, train_test_split
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use("ggplot")
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
#load the dataset
retail = pd.read_csv('retail.csv')
retail.head()
#shape of our dataset
retail.shape
# Correctly encode the variables
retail['InvoiceDate'] = pd.to_datetime(retail['InvoiceDate'])
# CustomerID is an identifier, not a quantity -- keep it as object dtype
retail = retail.astype({'CustomerID': object})
#exploring the unique values of each attribute
print("Number of transactions: ", retail['InvoiceNo'].nunique())
print("Number of products: ",retail['StockCode'].nunique())
print("Number of customers:", retail['CustomerID'].nunique() )
print("Percentage of customers NA (new): ", round(retail['CustomerID'].isnull().sum() * 100 / len(retail),2),"%" )
print('Number of countries: ',retail['Country'].nunique())
# top-30 products by number of transaction lines
retail['StockCode'].value_counts().head(30).plot(kind='bar')
# top-30 products by total quantity sold
retail.groupby(['StockCode'])['Quantity'].sum().sort_values(ascending=False).head(30).plot(kind='bar')
# address incorrect product descriptions
retail['Description'].value_counts()
# check for missing values
retail.isnull().sum()
# check for duplicates
retail.duplicated().sum()
#drop duplicated ones
retail.drop_duplicates(inplace=True)
retail.shape
# outlier check on unit price
sns.boxplot(x = retail['UnitPrice'])
# lets check high values
retail[abs(retail['UnitPrice']) > 3000]
#some of these values have 'AMAZON FEE' and are about post offices. lets take a closer look:
retail[retail['Description']=='AMAZON FEE']
#lets drop fees paid to Amazon
retail.drop(retail[retail['Description']=='AMAZON FEE'].index, axis=0, inplace=True)
retail[retail['Description'].str.contains('POSTAGE', na=False)]['Description'].value_counts()
retail[retail['Description'].str.contains('POSTAGE', na=False)]
# descriptions containing 'POSTAGE' are not products and they have different unit prices for same descriptions
#for the purpose of analysis we can drop them
retail.drop(retail[retail['Description'].str.contains('POSTAGE', na=False)].index, axis=0, inplace=True)
#lets check if there still are negative unit prices:
retail[retail['UnitPrice'] < 0]
#they dont provide any information about products, so we can drop them
retail.drop(retail[retail['UnitPrice'] < 0].index, axis=0, inplace=True)
#lets check high values once again:
retail[abs(retail['UnitPrice']) > 3000]
#they don't provide information about products sold, we can drop them too:
retail.drop(retail[abs(retail['UnitPrice']) > 3000].index, axis=0,inplace=True)
sns.boxplot(x = retail['UnitPrice'])
# zero-priced rows that still carry a product description
retail[(retail['UnitPrice']==0) & (retail['Description'].isna()==False)]
sns.boxplot(x = retail['Quantity'])
#all negative values have C as a prefix in their invoice number, we will have a closer look at them in next section
retail[retail['Quantity']<0]
retail.isna().sum()
#while examining descriptions above, we found out that some of descriptions are written in lower case and they are not prod name
#lets check them now
low_case=[]
for i in retail['Description'].unique():
    if str(i).islower():
        low_case.append(i)
# str(np.nan) is 'nan', which is all lower-case, so NaN ends up in the list; drop it
low_case.remove(np.nan)
low_case
#they are mostly problematic descriptions
retail[retail['Description'].isin(low_case)]
#all of them has the price of zero, so we can drop them and there are only 493 rows
retail[retail['Description'].isin(low_case)]['UnitPrice'].value_counts()
#drop all lowercase in description
retail.drop(retail[retail['Description'].isin(low_case)].index, axis=0, inplace=True)
# collect descriptions written fully in upper case (the usual product-name style)
up_case=[]
for i in retail['Description'].unique():
    if str(i).isupper():
        up_case.append(i)
#check the values that are not in upper case and lower case, mixed ones
mixed=[]
for i in retail[retail['Description'].isin(up_case) == False]['Description'].unique():
    #some of the descriptions are upper case except a weight indicated with lower case g; to eliminate those check the first two characters
    if str(i)[0:2].isupper() == False:
        mixed.append(i)
#exclude the nans
mixed.remove(np.nan)
mixed
retail[(retail['Description'].isin(mixed))]
# hand-picked non-product descriptions (damage notes, errors, etc.) to remove
to_drop = ['?', 'Found', 'Given away', 'Thrown away.', 'mystery! Only ever imported 1800', 'Display', 'Missing',
           'damages/credits from ASOS.', 'Not rcvd in 10/11/2010 delivery', 'Thrown away-rusty',
           'incorrectly credited C550456 see 47', 'Damaged', 'Water damaged', 'Printing smudges/thrown away',
           'Show Samples', 'Damages/samples', 'Adjust bad debt', 'Crushed', '??', 'Found in w/hse', 'Dagamed', 'Incorrect stock entry.',
           'Wet pallet-thrown away', 'Had been put aside.', 'Sale error', 'Breakages', 'Marked as 23343', '20713', 'Found by jackie',
           'Damages', 'Unsaleable, destroyed.', 'Wrongly mrked had 85123a in box', 'John Lewis', '???']
retail[retail['Description'].isin(to_drop)]
#no unitprice, no customerid, problematic description: should be dropped
#drop problematic descriptions
retail.drop(retail[retail['Description'].isin(to_drop)].index, axis=0, inplace=True)
# stock codes whose rows are missing a description
nan_stock_code = retail[retail['Description'].isna()]['StockCode'].unique()
#fill the nan values with most common value for a stockcode when it has at least one description
for i in nan_stock_code:
    if len(retail[(retail['StockCode']==i)]['Description'].value_counts()) != 0:
        retail.loc[(retail['StockCode']==i) & (retail['Description'].isna()), 'Description'] = retail[retail['StockCode']==i]['Description'].value_counts().index[0]
retail.isna().sum()
#drop 118 rows with no description provided
retail.dropna(subset=['Description'], inplace=True)
retail.isna().sum()
#create a function that will assign 0 to all variables unless the first character of 'InvoiceNo' is 'C'
def cancel(row):
    """Return 1 if the row's invoice is a cancellation (InvoiceNo starts with 'C'), else 0."""
    # str() + startswith is robust to numeric invoice numbers and to empty
    # strings, where indexing [0] would raise an exception.
    return int(str(row['InvoiceNo']).startswith('C'))
# Create a new column 'Cancel' to attach to 'data' and set it to the value returned
#by the function cancel().
# The code 'axis=1' makes the apply function process the dataset by row,
#as opposed to by column which is the default option.
retail['Cancel'] = retail.apply(cancel, axis=1)
# get cancelled transactions
cancelled_orders = retail[retail['Cancel']==1]
cancelled_orders
# sanity check: are there any cancelled rows with positive quantities?
cancelled_orders[cancelled_orders['Quantity']>0]
Negative values in the Quantity column mean that it is a cancelled quantity, because we did not find any positive value for orders where InvoiceNo carries the prefix C. How many cancelled orders do we have?
#check how many rows our dataframe of cancelled orders contain
print("We have ",cancelled_orders['InvoiceNo'].nunique(), " cancelled orders.")
#percentage of cancelled orders in total orders
total_orders = retail['InvoiceNo'].nunique()
cancelled_number = cancelled_orders['InvoiceNo'].nunique()
print('Percentage of orders canceled: {}/{} ({:.2f}%) '.format(cancelled_number, total_orders, cancelled_number/total_orders*100))
#we will explore only positive not-cancelled orders for old customer for modelling
retail_clean = retail[retail['Quantity'] > 0]
# keep only identified customers (rows with a CustomerID)
retail_clean = retail_clean.dropna(subset=['CustomerID'])
# Distribution of number of purchases
data1 = retail_clean['CustomerID'].value_counts() # count of events per visitorid
# share of customers with 1..9 events; everything beyond is folded into a '10+' bucket
data2 = data1.value_counts(normalize=True)[:9]
data2[10] = data1.value_counts(normalize=True)[9:].sum() # count of counts of events per visitorid
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(17,5))
ax1.boxplot(data1)
ax2.bar(data2.index, data2.values)
ax2.set_xticks(list(range(1,11)))
ax2.set_xticklabels(list(range(1,10)) +['10+'])
fig.suptitle("Distribution of number of visitor events")
plt.show()
print("{0:.2f}% of customers have more than 1 purchase!".format(100 * (np.sum(data1 > 1) / data1.shape[0])))
# Distribution of number of item events
data1 = retail_clean['StockCode'].value_counts() # count of events per item
# same 1..9 plus '10+' bucketing as above, per item this time
data2 = data1.value_counts(normalize=True)[:9]
data2[10] = data1.value_counts(normalize=True)[9:].sum() # count of counts of events per visitorid
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(17,5))
ax1.boxplot(data1)
ax2.bar(data2.index, data2.values)
ax2.set_xticks(list(range(1,11)))
ax2.set_xticklabels(list(range(1,10)) +['10+'])
fig.suptitle("Distribution of number of item events")
plt.show()
print("{0:.2f}% of items have more than 1 event!".format(100 * (np.sum(data1 > 1) / data1.shape[0])))
# Create an additional column for date as year and month
retail_clean['Date'] = retail_clean['InvoiceDate'].dt.strftime("%Y-%m")
# Create a new column for the total expenditure of that product in the purchase.
retail_clean['Sales'] = (retail_clean['UnitPrice'] * retail_clean['Quantity'])
#Visualize the variable productsNumber distribution
groupby_invoice = pd.DataFrame(retail_clean.groupby('InvoiceNo')['StockCode'].nunique())
groupby_invoice.columns=['productsNumber']
fig, ax = plt.subplots()
fig.set_size_inches(10, 6)
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; histplot/displot is the replacement
sns.distplot(groupby_invoice['productsNumber'],ax=ax)
plt.show()
#We have a skewed distribution of products. Most people buy less than 25 items.
# treemap of total sales per country
fig = px.treemap(retail_clean,
                 path = ['Country'],
                 values='Sales')
fig.show()
#top-5 countries
retail_clean.groupby('Country').sum().sort_values(by='Sales', ascending=False)[:5]
retail_clean['CustomerID'].value_counts()
# Visualize number of events per day
df = pd.DatetimeIndex(retail_clean['InvoiceDate']).normalize().value_counts().sort_index()
fig = plt.figure(figsize=(12,6))
plt.plot(df.index, df.values, linestyle="-")
plt.xticks(np.arange(df.index[0], df.index[-1], pd.to_timedelta(7, unit='d')), rotation=90)
plt.title('Event frequency time series')
plt.show()
# How many weeks does the dataset has?
diff = (df.index.max() - df.index.min())
print(f"The dataset has {diff.days} days, corresponding to {diff.days//7} weeks.")
def ABC_analysis(df):
    """Segment customers into importance classes A/B/C by total sales.

    The top 20% of customers (by total 'Sales') become class 'A', the next
    30% class 'B', and the remaining 50% class 'C'.

    Parameters
    ----------
    df : pandas.DataFrame with 'CustomerID' and 'Sales' columns.

    Returns
    -------
    pandas.DataFrame with columns CustomerID, Sales, Rank, Importance,
    sorted by total Sales descending.
    """
    grouped_df = (
        df.loc[:, ['CustomerID', 'Sales']]
        .groupby('CustomerID')
        .sum()
        .sort_values(by=['Sales'], ascending=False)
    )
    ## Ranking by importance (1 = highest spender)
    grouped_df["Rank"] = grouped_df['Sales'].rank(ascending=False)
    grouped_df = grouped_df.reset_index()
    ## Categorise into class A, B, C splitting 20-30-50.
    # Assign via boolean masks with .loc instead of chained indexing
    # (grouped_df['Importance'][a:b] = ...), which raises
    # SettingWithCopyWarning and silently fails under pandas copy-on-write.
    n_customers = int(grouped_df['Rank'].max())
    cut_a = int(0.2 * n_customers)
    cut_b = int(0.5 * n_customers)
    positions = grouped_df.index  # RangeIndex 0..n-1 after reset_index
    grouped_df['Importance'] = 'C'
    grouped_df.loc[(positions >= cut_a) & (positions < cut_b), 'Importance'] = 'B'
    grouped_df.loc[positions < cut_a, 'Importance'] = 'A'
    return grouped_df
# run ABC segmentation on the cleaned data
ABC_groups = ABC_analysis(retail_clean)
ABC_groups.head()
sns.barplot(x = 'Importance', y = 'Sales',data = ABC_groups)
ABC_groups['Importance'].value_counts()
print("Now let's see importance contribution of each group")
# share of total revenue contributed by each importance class
ABC_groups.groupby('Importance')['Sales'].sum() / ABC_groups['Sales'].sum() * 100.
# month period of each invoice, used to measure purchase frequency
retail_clean['Y-M'] = pd.to_datetime(retail_clean['InvoiceDate']).dt.to_period('M')
retail_clean.groupby(['CustomerID', 'Y-M'])['Y-M'].count()
# one-hot matrix: 1 if the customer bought anything in that month, else 0
pt = retail_clean.pivot_table(values='InvoiceNo', index='CustomerID', columns='Y-M', aggfunc=lambda x: 1 if len(x)>0 else 0).fillna(0)
pt.head()
# number of active months per customer
pt['sum'] = pt.sum(axis=1)
customer_freq = pd.DataFrame()
customer_freq['customer'] = pt.index
customer_freq['freq_val'] = pt['sum'].values
# frequency classes: Z = active 1-4 months, Y = 5-8, X = 9-12
customer_freq['group'] = customer_freq['freq_val'].map({1: 'Z', 2: 'Z' , 3: 'Z', 4:'Z', 5:'Y', 6:'Y', 7:'Y', 8:'Y' , 9:'X', 10: 'X', 11:'X', 12:'X'})
sns.barplot(x = customer_freq.groupby(['group']).agg('count').index, y = customer_freq.groupby(['group']).agg('count').values[:,1])
customer_freq['group'].value_counts().sort_values(ascending=True)
# customer x item matrix of total quantities bought
customer_item_matrix = retail_clean.pivot_table(
    index='CustomerID',
    columns='StockCode',
    values='Quantity',
    aggfunc='sum'
)
# binarize: 1 if the customer ever bought the item, 0 otherwise
customer_item_matrix = customer_item_matrix.applymap(lambda x: 1 if x > 0 else 0)
customer_item_matrix.shape
customer_item_matrix
# user-user collaborative filtering: cosine similarity between customers' purchase vectors
user_user_sim_matrix = pd.DataFrame(cosine_similarity(customer_item_matrix))
user_user_sim_matrix.columns = customer_item_matrix.index
user_user_sim_matrix['CustomerID'] = customer_item_matrix.index
user_user_sim_matrix = user_user_sim_matrix.set_index('CustomerID')
# customers most similar to customer 17935
user_user_sim_matrix.loc[17935.0].sort_values(ascending=False)
# items bought by customer A (12350): nonzero entries of their row
items_bought_by_A = set(customer_item_matrix.loc[12350.0].iloc[customer_item_matrix.loc[12350.0].to_numpy().nonzero()].index)
items_bought_by_A
items_bought_by_B = set(customer_item_matrix.loc[17935.0].iloc[
    customer_item_matrix.loc[17935.0].to_numpy().nonzero()
].index)
items_bought_by_B
# recommend to B whatever similar customer A bought that B has not
items_to_recommend_to_B = items_bought_by_A - items_bought_by_B
items_to_recommend_to_B
retail_clean.loc[
    retail_clean['StockCode'].isin(items_to_recommend_to_B),
    ['StockCode', 'Description']
].drop_duplicates().set_index('StockCode')
# item-item collaborative filtering: cosine similarity between items' buyer vectors
item_item_sim_matrix = pd.DataFrame(cosine_similarity(customer_item_matrix.T))
item_item_sim_matrix.columns = customer_item_matrix.T.index
item_item_sim_matrix['StockCode'] = customer_item_matrix.T.index
item_item_sim_matrix = item_item_sim_matrix.set_index('StockCode')
item_item_sim_matrix
# the 10 items most similar to product 23166 (the item itself ranks first)
top_10_similar_items = list(
    item_item_sim_matrix\
    .loc['23166']\
    .sort_values(ascending=False)\
    .iloc[:10]\
    .index
)
top_10_similar_items
retail_clean.loc[
    retail_clean['StockCode'].isin(top_10_similar_items),
    ['StockCode', 'Description']
].drop_duplicates().set_index('StockCode').loc[top_10_similar_items]
# density of the user-item matrix as we require more events per user/item
users_counts= retail_clean['CustomerID'].value_counts()
items_counts = retail_clean['StockCode'].value_counts()
scores = []
for n_events in range(1, 10):
    users_counts = users_counts[users_counts > n_events]
    items_counts = items_counts[items_counts > n_events]
    # NOTE(review): the numerator keeps the full event count even after users/items
    # are filtered out, so this is an optimistic density estimate -- confirm intent
    scores.append(retail_clean.shape[0] / (len(users_counts) * len(items_counts)) * 100)
px.line(x = range(1, 10), y = scores, labels = {'x':'n_events', 'y':'Sparsity'})
def threshold_ratings(df, uid_min, iid_min, max_iter=None):
    """Removes users and items with less than uid_min and iid_min event occurrences, respectively.
    Credits: https://www.ethanrosenthal.com/2016/10/19/implicit-mf-part-1/
    """
    total_users = df['CustomerID'].nunique()
    total_items = df['StockCode'].nunique()
    density = float(df.shape[0]) / float(total_users * total_items) * 100
    print('Raw dataset info \n-----------------')
    print('Number of users: {}'.format(total_users))
    print('Number of items: {}'.format(total_items))
    print('Sparsity: {:4.3f}%'.format(density))
    # Dropping infrequent users can create newly infrequent items (and vice
    # versa), so the two filters alternate until the row count stabilizes.
    n_passes = 0
    while True:
        events_before = df.shape[0]
        user_freq = df.groupby('CustomerID').size()
        df = df[~df['CustomerID'].isin(user_freq[user_freq < uid_min].index.tolist())]
        item_freq = df.groupby('StockCode').size()
        df = df[~df['StockCode'].isin(item_freq[item_freq < iid_min].index.tolist())]
        n_passes += 1
        if df.shape[0] == events_before or n_passes == max_iter:
            break
    if not max_iter:
        # Convergence must hold: every surviving user/item meets its frequency floor.
        assert(df.groupby('CustomerID').size().min() >= uid_min)
        assert(df.groupby('StockCode').size().min() >= iid_min)
    total_users = df['CustomerID'].nunique()
    total_items = df['StockCode'].nunique()
    density = float(df.shape[0]) / float(total_users * total_items) * 100
    print('Limited dataset info \n-----------------')
    print('Number of iterations until convergence: {}'.format(n_passes))
    print('Number of users: {}'.format(total_users))
    print('Number of items: {}'.format(total_items))
    print('Sparsity: {:4.3f}%'.format(density))
    return df
# get limited dataset: keep users and items with at least 10 events each
df_limited = threshold_ratings(retail_clean, 10, 10)
We want to split the events into train and test sets by time: the first 45 weeks of events are used for training and the following 5 weeks for testing.
# How many weeks does the dataset has?
diff = (df_limited.InvoiceDate.max() - df_limited.InvoiceDate.min())
print(f"The dataset has {diff.days} days, corresponding to {diff.days//7} weeks.")
# Train-test split: first 45 weeks for training, following 5 weeks for testing
start_train = df_limited['InvoiceDate'].min()
start_test = start_train + pd.to_timedelta(45, unit='w')
end_test = start_test + pd.to_timedelta(5, unit='w')
# Create new limited df
df_limited = df_limited.loc[(df_limited['InvoiceDate'] > start_train) & (df_limited['InvoiceDate'] <= end_test)]
# Create train_split flag: 1 = train period, 0 = test period
df_limited['train_split'] = (df_limited['InvoiceDate'] <= start_test).astype(int)
print("Proportion of train events: {:.2f}".format(df_limited['train_split'].mean()))
# Visualize train and test set
df = pd.DatetimeIndex(df_limited['InvoiceDate']).normalize().value_counts().sort_index()
fig = plt.figure(figsize=(12,6))
plt.plot(df.index, df.values, linestyle="-")
plt.xticks(np.arange(df.index[0], df.index[-1], pd.to_timedelta(7, unit='d')), rotation=90)
plt.vlines(start_test, 0, df.max(), linestyles='dashed', color='r', label='train-test split')
plt.legend()
plt.title('Event frequency time series - train and test set')
plt.show()
# the Categoricals data structure consists of a categories array and an integer array of codes which point to
# the real value in the categories array
user_cat = df_limited['CustomerID'].astype('category')
item_cat = df_limited['StockCode'].astype("category")
# create a sparse matrix of all the item/user/counts triples for the train set and test set
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.coo_matrix.html#scipy.sparse.coo_matrix
# train matrix: values are the train_split flag, so test-period events become explicit zeros
item_user_train = coo_matrix((df_limited['train_split'],
                              (item_cat.cat.codes,
                               user_cat.cat.codes))).tocsr()
item_user_train.eliminate_zeros() # remove zero entries
# produce transpose of item_user_train
user_item_train = item_user_train.T
# test matrix: flag is inverted, so only test-period events are non-zero
item_user_test = coo_matrix(((~df_limited['train_split'].astype(bool)).astype(int),
                             (item_cat.cat.codes,
                              user_cat.cat.codes))).tocsr()
item_user_test.eliminate_zeros() # remove zero entries
# produce transpose of item_user_test
user_item_test = item_user_test.T
# map each item and user category to a unique numeric code
user_map = dict(zip(user_cat, user_cat.cat.codes))
item_map = dict(zip(item_cat, item_cat.cat.codes))
def get_keys(value, dictionary):
    """Return the first dictionary key mapped to the specified value.

    Raises ValueError if no key maps to *value* (same exception type as the
    original list.index-based lookup).
    """
    # Single pass over items() instead of materializing both the key list
    # and the value list.
    for key, mapped in dictionary.items():
        if mapped == value:
            return key
    raise ValueError('{!r} is not a value in the dictionary'.format(value))
# confirm shapes
print(f"train set shape: {item_user_train.shape} and test set shape: {item_user_test.shape}")
# check sparsity
pzeros_train = 100 * (1 - item_user_train.count_nonzero() / (item_user_train.shape[0] * item_user_train.shape[1]))
pzeros_test = 100 * (1 - item_user_test.count_nonzero() / (item_user_test.shape[0] * item_user_test.shape[1]))
print(f"train set percentage of zeros: {pzeros_train} and test set percentage of zeros: {pzeros_test}")
# users with no items on the train set and no items on the test set
zero_users_test = (np.squeeze(np.asarray(user_item_test.sum(axis=1))) == 0).nonzero()[0]
zero_users_train = (np.squeeze(np.asarray(user_item_train.sum(axis=1))) == 0).nonzero()[0]
set(zero_users_test).intersection(zero_users_train)
# most frequent user, item pair in train set
item_id, user_id = np.unravel_index(item_user_train.argmax(), item_user_train.shape)
# translate internal codes back to the original StockCode / CustomerID
item_id, user_id = get_keys(item_id, item_map), get_keys(user_id, user_map)
df_limited.loc[(df_limited['CustomerID'] == user_id) & (df_limited['StockCode'] == item_id) & (df_limited['train_split'] == 1)]
# initialize a model
alpha = 40 # as we observe more evidence for positive preference, our confidence in pui = 1 increases according to alpha (rate of increase)
als_model = AlternatingLeastSquares(factors=200, regularization=0.01, iterations=30, random_state=0)
# train the model on a sparse matrix of item/user/confidence weights
# os.environ['MKL_NUM_THREADS'] = '1'
# os.environ['OPENBLAS_NUM_THREADS'] = '1'
# about the alpha hyperparameter: https://github.com/benfred/implicit/issues/199#issuecomment-490350326
als_model.fit((item_user_train * alpha).astype('double'))
# recommend items for a user.
# the recommended items have the largest inner product with the user vector
user_id = list(user_map.keys())[0]
recommendations = als_model.recommend(user_map[user_id], user_item_train)
# translate internal item codes back to stock codes for readability
list(map(lambda x: (get_keys(x[0], item_map), x[1]), recommendations))
# find related items
# the related items have the largest inner product with the item vector
item_id = list(item_map.keys())[0]
related = als_model.similar_items(item_map[item_id])
list(map(lambda x: (get_keys(x[0], item_map), x[1]), related))
# show the top 10 items that explain the recommended item to the user
# It is possible to write the LVM as a linear function between preferences and past actions.
# We can then see what are the actions associated with the highest contributions to the given recommendation.
score, contributions, user_weights = als_model.explain(user_map[user_id],
                                                       user_item_train,
                                                       item_map[item_id])
print("The score of the user/item pair is: ", score)
print("The top N (itemid, score) contributions for this user/item pair are:\n", list(map(lambda x: (get_keys(x[0], item_map), x[1]), contributions)))
# Baseline: Recommend the most popular items to every user
class PopularRecommender():
    """Baseline Recommender that always suggests the most popular items to every user.

    Popularity is the total interaction count per item, i.e. the row sums of
    the item-user matrix passed to fit().
    """
    def fit(self, item_users):
        # Row sums give one interaction count per item; argsort descending
        # yields item ids ordered from most to least popular.
        popularity = np.asarray(item_users.sum(axis=1)).ravel()
        self.item_id_sort = np.argsort(popularity)[::-1]
    def recommend(self, userid, user_items, N=10, filter_already_liked_items=None, filter_items=None, recalculate_user=None):
        # Identity comparison with None (`is not None`) per PEP 8, instead of `!= None`.
        if filter_already_liked_items is not None or filter_items is not None or recalculate_user is not None:
            raise NotImplementedError("filter_already_liked_items, filter_items and recalculate_user aren't supported yet")
        # (item_id, rank) pairs, mirroring implicit's (id, score) output shape.
        return list(zip(self.item_id_sort[:N], range(1, N + 1)))
# Fitting PopularRecommender model
pop_model = PopularRecommender()
pop_model.fit(item_user_train)
# two more candidates from the implicit library: BPR and logistic MF
bpr_model = implicit.bpr.BayesianPersonalizedRanking(factors=200, use_gpu=False, iterations = 120)
bpr_model.fit(item_user_train)
lmf_model = implicit.lmf.LogisticMatrixFactorization(factors=200, use_gpu=False, iterations = 50)
lmf_model.fit(item_user_train)
# Evaluate models.
# Precision at K, Mean Average Precision at K, Normalized Discounted Cumulative Gain at K, AUC at K
eval_models = {'pop_model': pop_model, 'als_model': als_model, 'lmf_model': lmf_model, 'bpr_model': bpr_model}
eval_table = {}
for k, v in eval_models.items():
    eval_table[k] = ranking_metrics_at_k(v, user_item_train, user_item_test, K=10, show_progress=True, num_threads=0)
# one column per model, one row per ranking metric
eval_table = pd.DataFrame(eval_table)
eval_table
# first variant - select the 10 most bought products from the last 100 purchases for new customers
data = retail_clean.copy()
data = data.sort_values('InvoiceDate')
# last 100 rows by time, then the 10 rows with the largest quantities
recommendations = data.tail(100).sort_values('Quantity').tail(10)['StockCode'].values.tolist()
recommendations
# customers with only 1 purchase
data1 = retail.groupby('CustomerID').filter(lambda x: len(x) == 1)
# purchases with no CustomerID at all (anonymous/new customers)
data2 = retail[retail['CustomerID'].isna()]
# DataFrame.append() was deprecated and removed in pandas 2.0;
# pd.concat is the supported way to stack the two frames.
new_cust = pd.concat([data1, data2])
# customer x item matrix of quantities (0 where never bought)
ratings_utility_matrix = retail.pivot_table(values='Quantity', index='CustomerID', columns='StockCode', fill_value=0)
ratings_utility_matrix.head()
ratings_utility_matrix.shape
# transpose: rows become products, columns become customers
X = ratings_utility_matrix.T
X.head()
# reduce the customer dimension to 10 latent components
SVD = TruncatedSVD(n_components=10)
decomposed_matrix = SVD.fit_transform(X)
decomposed_matrix.shape
# product-product correlations in the latent space
correlation_matrix = np.corrcoef(decomposed_matrix)
correlation_matrix.shape
product_names = list(X.index)
# pick the 100th product as the "currently viewed" item
i = X.index[99]
product_ID = product_names.index(i)
product_ID
correlation_product_ID = correlation_matrix[product_ID]
correlation_product_ID.shape
# recommend highly correlated products (r > 0.9)
Recommend = list(X.index[correlation_product_ID > 0.90])
# Removes the item already bought by the customer
Recommend.remove(i)
Recommend[0:9]
itemset = retail[['StockCode','Description']]
itemset['Description']=itemset['Description'].astype(str)
#if not all(c.islower() for c in itemset['Description']):
# keep only fully upper-case descriptions (real product names); others become NaN
itemset['Description'] = [(np.where((x.isupper()),x, np.NaN)) for x in itemset['Description']]
itemset[itemset['Description'] =='nan']
# filling NANs with most frequent value
df2 = itemset.groupby('StockCode')['Description'].apply(lambda x: x.fillna(x.mode().iloc[0])).reset_index(drop=True)
df2 = df2.to_frame()
df2[df2['Description']=='nan']
# one row per product with its description
product_descriptions = itemset.dropna()
product_descriptions.drop_duplicates(subset='StockCode',inplace=True)
product_descriptions
desc=product_descriptions['Description'].astype(str)
# TF-IDF features over the product description texts
vectorizer = TfidfVectorizer(stop_words='english')
X1 = vectorizer.fit_transform(desc)
X1
# Fitting K-Means to the dataset
X=X1
kmeans = KMeans(n_clusters = 10, init = 'k-means++')
y_kmeans = kmeans.fit_predict(X)
# cluster assignment per product (x-axis = product index, y-axis = cluster id)
plt.plot(y_kmeans, ".")
plt.show()
def print_cluster(i):
    """Print the top-10 terms for cluster *i*.

    Relies on the module-level globals `order_centroids` (term indices per
    cluster, sorted by descending centroid weight) and `terms` (vocabulary).
    """
    # The original carried Python-2 leftovers: trailing commas after print()
    # built no-op tuples, and the final bare `print` was a no-op expression
    # in Python 3 instead of emitting the intended blank line.
    print("Cluster %d:" % i)
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind])
    print()
Recommendation of a product based on the current product selected by the user: recommend related products that are frequently bought together.
# # Optimal clusters is
true_k = 10
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
model.fit(X1)
print("Top terms per cluster:")
# for each cluster, term indices sorted by descending centroid weight
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
# get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() is the supported replacement.
terms = vectorizer.get_feature_names_out()
for i in range(true_k):
    print_cluster(i)
def show_recommendations(product):
    """Vectorize a free-text product query and print the terms of its cluster."""
    query_vector = vectorizer.transform([product])
    cluster_id = model.predict(query_vector)
    print_cluster(cluster_id[0])

show_recommendations('flower')
from lightfm import LightFM
from lightfm.evaluation import *
# drop users and items with fewer than 2 events
counts = df_limited['CustomerID'].value_counts()
item_counts = df_limited['StockCode'].value_counts()
data = df_limited[~df_limited['CustomerID'].isin(counts[counts < 2].index)]
data = data[~data['StockCode'].isin(item_counts[item_counts < 2].index)]
data.StockCode.nunique()
# Train-test split (same scheme as before: 45 weeks train, 5 weeks test)
start_train = data['InvoiceDate'].min()
start_test = start_train + pd.to_timedelta(45, unit='w')
end_test = start_test + pd.to_timedelta(5, unit='w')
# Create new limited df
data = data.loc[(data['InvoiceDate'] > start_train) & (data['InvoiceDate'] <= end_test)]
# Create train_split flag: 1 = train period, 0 = test period
data['train_split'] = (data['InvoiceDate'] <= start_test).astype(int)
print("Proportion of train events: {:.2f}".format(data['train_split'].mean()))
items = data[['StockCode','Description']]
data_train = data[data['train_split']==1]
data_test = data[data['train_split']==0]
# the Categoricals data structure consists of a categories array and an integer array of codes which point to
# the real value in the categories array
user_cat = data['CustomerID'].astype('category')
item_cat = data['StockCode'].astype("category")
# create a sparse matrix of all the item/user/counts triples for the train set and test set
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.coo_matrix.html#scipy.sparse.coo_matrix
item_user_train = coo_matrix((data['train_split'],
                              (item_cat.cat.codes,
                               user_cat.cat.codes))).tocsr()
item_user_train.eliminate_zeros() # remove zero entries
# produce transpose of item_user_train
user_item_train = item_user_train.T
# test matrix: flag inverted so only test-period events are non-zero
item_user_test = coo_matrix(((~data['train_split'].astype(bool)).astype(int),
                             (item_cat.cat.codes,
                              user_cat.cat.codes))).tocsr()
item_user_test.eliminate_zeros() # remove zero entries
# produce transpose of item_user_test
user_item_test = item_user_test.T
# map each item and user category to a unique numeric code
user_map = dict(zip(user_cat, user_cat.cat.codes))
item_map = dict(zip(item_cat, item_cat.cat.codes))
def get_keys(value, dictionary):
    """Return the first dictionary key mapped to the specified value.

    Raises ValueError if no key maps to *value* (same exception type as the
    original list.index-based lookup).
    """
    # Single pass over items() instead of materializing both the key list
    # and the value list.
    for key, mapped in dictionary.items():
        if mapped == value:
            return key
    raise ValueError('{!r} is not a value in the dictionary'.format(value))
# confirm shapes
print(f"train set shape: {item_user_train.shape} and test set shape: {item_user_test.shape}")
# check sparsity
pzeros_train = 100 * (1 - item_user_train.count_nonzero() / (item_user_train.shape[0] * item_user_train.shape[1]))
pzeros_test = 100 * (1 - item_user_test.count_nonzero() / (item_user_test.shape[0] * item_user_test.shape[1]))
print(f"train set percentage of zeros: {pzeros_train} and test set percentage of zeros: {pzeros_test}")
# dense customer x item quantity matrix built from the train period only
ratings_utility_matrix_train = data_train.pivot_table(values='Quantity', index='CustomerID', columns='StockCode', fill_value=0)
ratings_utility_matrix_train.head()
# interactions matrix
interactions = ratings_utility_matrix_train
def create_user_dict(interactions):
    '''
    Function to create a user dictionary based on their index and number in interaction dataset
    Required Input -
        interactions - dataset created by create_interaction_matrix
    Expected Output -
        user_dict - Dictionary with user_id as key and its positional index in `interactions` as value
    '''
    # enumerate replaces the original manual counter loop
    return {user_id: position for position, user_id in enumerate(interactions.index)}
def create_item_dict(df, id_col, name_col):
    '''
    Function to create an item dictionary based on their item_id and item name
    Required Input -
        - df = Pandas dataframe with Item information
        - id_col = Column name containing unique identifier for an item
        - name_col = Column name containing name of the item
    Expected Output -
        item_dict = Dictionary type output containing item_id as key and item_name as value
    '''
    # Pair the two columns positionally. The original indexed with
    # df.loc[i, ...] for i in range(len(df)), which raises KeyError whenever
    # df has a non-default index (e.g. a filtered slice of a larger frame).
    return dict(zip(df[id_col], df[name_col]))
def runMF(interactions, n_components=30, loss='warp', k=15, epoch=30, n_jobs=4):
    '''
    Train a LightFM matrix-factorization model on an interaction matrix.
    Required Input -
        - interactions = dataset created by create_interaction_matrix
        - n_components = number of embeddings used to represent each item and user
        - loss = loss function; other options are 'logistic' and 'bpr'
        - k = the k-th positive example used by the WARP-kOS loss
        - epoch = number of epochs to run
        - n_jobs = number of cores used for execution
    Expected Output -
        Model - Trained model
    '''
    interaction_csr = sparse.csr_matrix(interactions.values)
    mf_model = LightFM(no_components=n_components, loss=loss, k=k)
    mf_model.fit(interaction_csr, epochs=epoch, num_threads=n_jobs)
    return mf_model
def sample_recommendation_user(model, interactions, user_id, user_dict,
                               item_dict, threshold=0, nrec_items=10, show=True):
    '''
    Function to produce user recommendations
    Required Input -
        - model = Trained matrix factorization model
        - interactions = dataset used for training the model
        - user_id = user ID for which we need to generate recommendation
        - user_dict = Dictionary type input containing interaction_index as key and user_id as value
        - item_dict = Dictionary type input containing item_id as key and item_name as value
        - threshold = value above which the rating is favorable in new interaction matrix
        - nrec_items = Number of output recommendation needed
        - show = when True, print the known likes and the recommendations
    Expected Output -
        - Prints list of items the given user has already bought
        - Prints list of N recommended items which user hopefully will be interested in
        - Returns the list of recommended item ids
    '''
    n_users, n_items = interactions.shape
    user_x = user_dict[user_id]
    # score every item for this user, then rank item ids by predicted score
    scores = pd.Series(model.predict(user_x, np.arange(n_items)))
    scores.index = interactions.columns
    scores = list(pd.Series(scores.sort_values(ascending=False).index))
    # items the user already interacted with above the threshold
    known_items = list(pd.Series(interactions.loc[user_id, :]
                                 [interactions.loc[user_id, :] > threshold].index)
                       .sort_values(ascending=False))
    # set membership makes the exclusion filter O(n) instead of O(n*m)
    known_set = set(known_items)
    scores = [x for x in scores if x not in known_set]
    return_score_list = scores[0:nrec_items]
    known_items = list(pd.Series(known_items).apply(lambda x: item_dict[x]))
    scores = list(pd.Series(return_score_list).apply(lambda x: item_dict[x]))
    if show:
        print("Known Likes:")
        counter = 1
        for i in known_items:
            print(str(counter) + '- ' + i)
            counter += 1
        print("\n Recommended Items:")
        counter = 1
        for i in scores:
            print(str(counter) + '- ' + i)
            counter += 1
    return return_score_list
def sample_recommendation_item(model, interactions, item_id, user_dict, item_dict, number_of_user):
    '''
    Function to produce a list of top N interested users for a given item
    Required Input -
        - model = Trained matrix factorization model
        - interactions = dataset used for training the model
        - item_id = item ID for which we need to generate recommended users
        - user_dict = Dictionary type input containing interaction_index as key and user_id as value
        - item_dict = Dictionary type input containing item_id as key and item_name as value
        - number_of_user = Number of users needed as an output
    Expected Output -
        - user_list = List of recommended users
    '''
    n_users, n_items = interactions.shape
    # Index.get_loc returns the exact column position; the previous
    # np.searchsorted silently returned a wrong slot when the columns were not
    # sorted (and an out-of-range one for an unknown id). get_loc raises
    # KeyError for an unknown item instead of scoring the wrong column.
    item_x = interactions.columns.get_loc(item_id)
    # score the item once per user, then rank users by predicted score
    scores = pd.Series(model.predict(np.arange(n_users), np.repeat(item_x, n_users)))
    user_list = list(interactions.index[scores.sort_values(ascending=False).head(number_of_user).index])
    return user_list
def create_item_emdedding_distance_matrix(model, interactions):
    '''
    Function to create item-item distance embedding matrix
    Required Input -
        - model = Trained matrix factorization model
        - interactions = dataset used for training the model
    Expected Output -
        - item_emdedding_distance_matrix = Pandas dataframe containing cosine distance matrix b/w items
    '''
    # pairwise cosine similarity between every learned item embedding
    embeddings_sparse = sparse.csr_matrix(model.item_embeddings)
    pairwise_sim = cosine_similarity(embeddings_sparse)
    # label rows and columns with the item ids from the interaction matrix
    item_emdedding_distance_matrix = pd.DataFrame(
        pairwise_sim, index=interactions.columns, columns=interactions.columns)
    return item_emdedding_distance_matrix
def item_item_recommendation(item_emdedding_distance_matrix, item_id,
                             item_dict, n_items=10, show=True):
    '''
    Function to create item-item recommendation
    Required Input -
        - item_emdedding_distance_matrix = Pandas dataframe containing cosine distance matrix b/w items
        - item_id = item ID for which we need to generate recommended items
        - item_dict = Dictionary type input containing item_id as key and item_name as value
        - n_items = Number of items needed as an output
    Expected Output -
        - recommended_items = List of recommended items
    '''
    similarity_row = item_emdedding_distance_matrix.loc[item_id, :]
    top_similar = similarity_row.sort_values(ascending=False).head(n_items + 1)
    # skip the first entry: it is the item's similarity with itself
    recommended_items = list(top_similar.index[1:n_items + 1])
    if show == True:
        print("Item of interest :{0}".format(item_dict[item_id]))
        print("Item similar to the above item:")
        for position, rec_id in enumerate(recommended_items, start=1):
            print(str(position) + '- ' + item_dict[rec_id])
    return recommended_items
#create user dict
# CustomerID -> row position in `interactions`
user_dict = create_user_dict(interactions)
# flatten `items` to a clean RangeIndex so create_item_dict's positional
# df.loc[i, col] lookups resolve (presumably a StockCode/Description table
# built earlier in the notebook -- TODO confirm)
items = items.reset_index().drop(columns=['index'])
#create item dict
# StockCode -> Description, used to print human-readable recommendations
item_dict = create_item_dict(df = items,
                             id_col = 'StockCode',
                             name_col = 'Description')
# building matrix factorization model
mf_model = runMF(interactions = interactions,
                 n_components = 30,
                 loss = 'warp',
                 k = 15,
                 epoch = 30,
                 n_jobs = 4)
# user recommender
# top-10 items for one sample customer; threshold=4 means only quantities > 4
# count as "known likes" to exclude from the recommendations
rec_list = sample_recommendation_user(model = mf_model,
                                      interactions = interactions,
                                      user_id = 12347.0,
                                      user_dict = user_dict,
                                      item_dict = item_dict,
                                      threshold = 4,
                                      nrec_items = 10)
# item-user recommender
# top-15 users predicted to be most interested in one sample item
sample_recommendation_item(model = mf_model,
                           interactions = interactions,
                           item_id = '10002',
                           user_dict = user_dict,
                           item_dict = item_dict,
                           number_of_user = 15)
# item-item recommender
# cosine similarities between the learned item embeddings, then the
# most-similar items to one sample StockCode
item_item_dist = create_item_emdedding_distance_matrix(model = mf_model,
                                                       interactions = interactions)
rec_list = item_item_recommendation(item_emdedding_distance_matrix = item_item_dist,
                                    item_id = '15060B',
                                    item_dict = item_dict,
                                    n_items = 10)
def runMF2(interactions, n_components=30, loss='warp', k=15, epoch=30, n_jobs=4):
    '''
    Function to run matrix-factorization algorithm
    Required Input -
        - interactions = dataset create by create_interaction_matrix
        - n_components = number of embeddings you want to create to define Item and user
        - loss = loss function other options are logistic, brp
        - k = the k parameter forwarded to LightFM (used by the warp-kos loss)
        - epoch = number of epochs to run
        - n_jobs = number of cores used for execution
    Expected Output -
        Model - Trained model
    '''
    # unlike runMF, the input is passed to fit() as-is (no csr conversion),
    # so the caller is expected to hand in an already-sparse matrix
    mf = LightFM(no_components=n_components, loss=loss, k=k)
    mf.fit(interactions, epochs=epoch, num_threads=n_jobs)
    return mf
#create user dict
# CustomerID -> row position in `interactions` (rebuilt; same as the earlier cell)
user_dict = create_user_dict(interactions)
items = items.reset_index().drop(columns=['index'])
#create item dict
item_dict = create_item_dict(df = items,
                             id_col = 'StockCode',
                             name_col = 'Description')
# building matrix factorization model on the sparse train split.
# NOTE(review): the original cell trained this model twice back-to-back and
# the first result was immediately overwritten, so the redundant (and
# expensive) first training run has been removed.
mf_model2 = runMF2(interactions = item_user_train,
                   n_components = 30,
                   loss = 'warp',
                   k = 15,
                   epoch = 30,
                   n_jobs = 4)
# user recommender
# NOTE(review): mf_model2 was fit on item_user_train while the lookups below
# use the dense `interactions` pivot -- confirm both share the same
# item/user ordering, otherwise the predicted columns are mislabeled
rec_list = sample_recommendation_user(model = mf_model2,
                                      interactions = interactions,
                                      user_id = 12347.0,
                                      user_dict = user_dict,
                                      item_dict = item_dict,
                                      threshold = 4,
                                      nrec_items = 10)
rec_list = item_item_recommendation(item_emdedding_distance_matrix = item_item_dist,
                                    item_id = '15060B',
                                    item_dict = item_dict,
                                    n_items = 10)
# Evaluate the trained model on the held-out interactions.
# NOTE(review): train_interactions=None means items already seen in training
# are NOT excluded from the ranking -- confirm this is the intended protocol.
prec = precision_at_k(mf_model2, item_user_test, train_interactions=None, k=10).mean()
rec = recall_at_k(mf_model2, item_user_test, train_interactions=None, k=10).mean()
auc = auc_score(mf_model2, item_user_test, train_interactions=None).mean()
print(prec)
print(rec)
print(auc)